#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.6 - 15-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
######## export PYTHON_EGG_CACHE=/tmp
import pprint
import os
import nltk
# import rocksdb # shared library kann aktuell noch nicht gelesen werden
import MySQLdb # apt-get install python-mysqldb
from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
from random import randint
from past.builtins import basestring # pip install future
import codecs
import sys
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
import re
from transliterate import translit, get_available_language_codes
import libleipzig
import pprint
import json
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
###python -m nltk.downloader -d /usr/share/nltk_data all
####python -m nltk.downloader all
###########nltk.download()
# nltk.download("punkt")
reload(sys)
sys.setdefaultencoding('utf-8')
# Global set of synonyms already used once, so the same replacement word
# is never inserted twice into the output document.
noDoubleHash = set()

# Input and output paths come from the command line:
#   argv[1] = source text file, argv[2] = destination file
inputfile = sys.argv[1]
outputfile = sys.argv[2]

# Read the whole input file as unicode.  codecs.open with an explicit
# encoding replaces the original open().read() + .decode("utf-8") pair and
# — unlike the original — closes the file handle deterministically.
with codecs.open(inputfile, 'r', encoding='utf-8') as _infile:
    text = _infile.read()
class SphinxitConfig(BaseSearchConfig):
    """Connection and behaviour settings for sphinxit full-text queries.

    Inherits defaults from ``BaseSearchConfig``; only the searchd
    endpoint and a few query flags are overridden here.
    """
    DEBUG = False        # do not echo generated SphinxQL statements
    WITH_META = False    # skip SHOW META after each query
    WITH_STATUS = False  # skip SHOW STATUS after each query
    POOL_SIZE = 5        # connection pool size for searchd
    # SQL_ENGINE = 'oursql'
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,    # sphinx searchd SphinxQL listener port
    }
# delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
# http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
# https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
# http://www.tutorialspoint.com/python/python_database_access.htm
# mysql = MySQLdb.connect("localhost","root","###########99","onetipp" ) # last working

# Connection to the Sphinx searchd daemon through its MySQL-protocol
# (SphinxQL) listener on port 9977.
sphinx = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=9977)  # sphinxQL
cursorSphinx = sphinx.cursor()

# Regular MySQL server (port 3306) holding the name and synonym tables.
mysql = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=3306)  # Mysql
# Autocommit so single-statement INSERTs are persisted immediately.
mysql.autocommit(True)
cursorMysql = mysql.cursor()
def log_warnings(curs):
    """Log any MySQL warnings accumulated on cursor *curs*.

    Bug fix: the original body referenced the ``logging`` module, which is
    never imported at file level, so every call raised ``NameError``.  The
    module is imported locally here to keep the fix self-contained.
    """
    import logging
    for msg in curs.messages:
        # Each entry is (exception_class, message_text); only surface
        # genuine MySQL warnings.
        if msg[0] == MySQLdb.Warning:
            logging.warning(msg[1])
def deumlaut(s):
    """Replace German umlauts and sharp-s in *s* with ASCII digraphs.

    Mirrors the usual ae/oe/ue transcription, preserving the case of
    the first letter (e.g. U-umlaut -> 'Ue').
    """
    replacements = (
        ('\xdf', 'ss'),   # sharp-s
        ('\xfc', 'ue'),   # u-umlaut
        ('\xdc', 'Ue'),
        ('\xf6', 'oe'),   # o-umlaut
        ('\xd6', 'Oe'),
        ('\xe4', 'ae'),   # a-umlaut
        ('\xc4', 'Ae'),
    )
    for umlaut, ascii_form in replacements:
        s = s.replace(umlaut, ascii_form)
    return s
def summarizeText(s):
    """Summarize German text *s* with sumy's LSA summarizer.

    Splits *s* into sentences via nltk, picks a random target length
    between 75% and 100% of the sentence count, and returns the summary
    sentences joined into one space-separated string.
    """
    ## sumy: https://github.com/miso-belica/sumy/tree/dev/sumy/summarizers
    sentences = nltk.sent_tokenize(s)
    sentenceCount = len(sentences)
    # Random target: keep between 75% and all of the sentences.
    randSentenceCount = randint(int((sentenceCount/100)*75), sentenceCount)
    # randCount = random.randint(iround(float((sentenceCount / 100) * 55)), iround(sentenceCount))
    parser = PlaintextParser.from_string(s, Tokenizer("german"))
    stemmer = Stemmer("german")
    # summarizer = TextRankSummarizer(stemmer)
    summarizer = Summarizer(stemmer)
    summary = summarizer(parser.document, randSentenceCount)
    returnText = ""
    #ISO-8859-1
    for sentence in summary:
        # NOTE(review): str() on a sumy Sentence relies on the utf-8
        # default encoding set at module import; without it non-ASCII
        # sentences could raise under Python 2 — confirm before reuse.
        returnText += str(sentence)
        returnText += " "
    return returnText
# Todos:
# create a stopword list in German
# if a stopword is part of a synonym
# give bad minus points
# Todos:
# create a stopword list in German
# if a stopword is part of a synonym
# give bad minus points
def SynRanker(s, t):
    """Score synonym candidate *s* as a replacement for original word *t*.

    Returns a float rank (higher is better).  Sentinel values:
      -10 : unusable input (empty, non-string, or zero length)
       -1 : synonym is identical to the original word

    Fixes vs. original: removed the syntax-broken multi-line commented
    print fragments, the no-op ``else: 1`` branches, dead assignments to
    ``startVal`` immediately before unconditional returns, and the unused
    ``delimiters`` local.  The scoring itself is unchanged.
    """
    # Python 2/3 compatibility: prefer basestring when it exists
    # (original imported it from past.builtins).
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Guard clauses: unusable input scores -10 immediately.
    if not s or not t:
        return -10
    if not isinstance(s, string_types) or not isinstance(t, string_types):
        return -10

    startVal = float(1.0)
    lenSyn = len(s)
    synHasDigits = any(i.isdigit() for i in s)  # informational (was printed)
    synhasSonder = False
    re_sonder = r"(\?|\.|\,|\;|\:|\!|\d)"  # punctuation or digit
    # NOTE(review): this pattern includes \w, so it matches nearly every
    # real word and the -0.50 penalty below almost always applies.
    # Looks like a bug in the original scoring — preserved unchanged.
    re_space = r"(\t|\r|\n|\s|\w)"
    firstS = s[0:1]
    firstT = t[0:1]

    if s == t:
        # Identical to the original word: useless as a synonym.
        return -1
    if lenSyn <= 0:
        return -10

    # Length scoring: 3..13 characters is neutral, shorter is penalized.
    if lenSyn >= 3 and lenSyn < 14:
        startVal += 0
    elif lenSyn < 3:
        startVal -= 0.35

    # Combined multi-word / length scoring.
    if (' ' in s) and lenSyn >= 14:
        startVal -= 0.75
    elif (' ' in s) and lenSyn < 14:
        startVal -= 0.55
    elif (' ' not in s) and lenSyn >= 14:
        startVal -= 0.05
    elif (' ' not in s) and lenSyn < 14:
        startVal += 0.05

    if re.search(re_space, s) is not None:
        startVal -= 0.50

    if re.search(re_sonder, s) is not None:
        startVal -= 0.075
        synhasSonder = True

    # Case agreement between synonym and original word.
    if firstS.isupper() and firstT.isupper():
        startVal += 0.15
    elif firstS.islower() and firstT.islower():
        startVal += 0.15
    elif firstS.isupper() and not firstT.isupper():
        startVal -= 0.25
    elif firstS.islower() and not firstT.islower():
        startVal -= 0.25

    # later: ResultCodes
    return float(startVal)
def iround(x):
    """iround(number) -> integer

    Round a number to the nearest integer, biasing positives upward by
    one after truncation (same arithmetic as the original expression).
    """
    shifted = round(x) - 0.5
    positive_bump = 1 if x > 0 else 0
    return int(shifted) + positive_bump
def getSynLeipzig(sl):
    """Fetch synonyms for word *sl* from the Leipzig thesaurus service.

    Returns a list of synonym strings (empty on any validation failure
    or when the service returns nothing).  Side effect: the raw result
    is JSON-encoded and inserted into the MySQL table synonym_leipzig.
    """
    #print ("Auto Syn - Leipzig: ", libleipzig.Thesaurus("Auto",10))
    retContent = []
    retSaveMysql = "W:"+sl  # log record: word first, then each synonym
    # Validate input: non-empty string of at least 3 characters.
    if not sl:
        return retContent
    elif not isinstance(sl, basestring):
        return retContent
    elif len(sl) < 3:
        return retContent
    # Remote call: request up to 50 thesaurus entries for the word.
    synLeipzig = libleipzig.Thesaurus(sl, 50)
    if not synLeipzig:
        return retContent
    else:
        for aSyn in synLeipzig:
            # aSyn[0] is the synonym term of each result row.
            retContent.append(str(aSyn[0]))
            retSaveMysql += ";S:"+(str(aSyn[0]))
    # Persist the lookup (word + synonyms) for later reuse/analysis.
    # len > 5 means at least one synonym was appended after the "W:" stem.
    if len(retSaveMysql) > 5:
        raw = json.dumps(retSaveMysql)
        loggit = "INSERT INTO synonym_leipzig(raw,uid) VALUES(%s, %s)"
        try:
            cursorMysql.execute(loggit, (raw, 0))
            mysql.commit()
        except MySQLdb.ProgrammingError:
            print("Function -getSynLeipzig()- failed: The following mysql query failed:")
            print(loggit)
            data = []  # NOTE(review): dead assignment, kept unchanged
    return retContent
# Summarize the text first, then walk its tokens replacing words with
# synonyms.  Two parallel token lists are kept: `tokens` becomes the main
# output, `tokensRaw` feeds the Russian transliteration demo at the end.
tSumy = summarizeText(text)
tokens = nltk.word_tokenize(tSumy)
tokensRaw = nltk.word_tokenize(tSumy)
count = -1
# Leistungsschutzrecht (German ancillary copyright): only short verbatim
# runs may be reused, so force a synonym swap at least every N tokens.
changeEveryWord = 9
changeEveryWordFlag = 0   # 1 while inside the "just replaced" window
changeEveryWordTemp = 0   # tokens counted since the last replacement

for word in tokens:
    count += 1
    # Is the token a known name?  Parameterized query — the original
    # interpolated `word` directly into the SQL string (injection risk).
    cursorMysql.execute(
        "SELECT * FROM (namen_table) WHERE BINARY `name` = %s LIMIT 1;",
        (word,))
    name_content = cursorMysql.fetchone()

    # A name was found -> never swap in a synonym, keep it (de-umlauted).
    if name_content is not None:
        tokens[count] = '' + deumlaut(word) + ''
        tokensRaw[count] = deumlaut(word)
        continue

    # Reset the forced-replacement window once enough tokens have passed.
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0
    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    if len(word) >= 2 and changeEveryWordFlag == 0:
        # First try the Leipzig thesaurus service.
        synDictLeipzig = {}
        sLeipzigList = getSynLeipzig(word)
        if sLeipzigList:
            # Rank every candidate not used before in this document.
            for wSynL in sLeipzigList:
                if wSynL not in noDoubleHash:
                    synDictLeipzig[wSynL] = SynRanker(wSynL, word)
            sortedSynList = sorted(synDictLeipzig.items(),
                                   key=lambda x: x[1], reverse=True)
            firstBestSynHit = str(sortedSynList[0][0])
            firstBestSynHitRank = str(sortedSynList[0][1])
            # later: randomly choose among equally top-ranked synonyms
            tokens[count] = '' + deumlaut(firstBestSynHit) + ''
            noDoubleHash.add(firstBestSynHit)
            tokensRaw[count] = deumlaut(firstBestSynHit)
            changeEveryWordFlag = 1
            changeEveryWordTemp += 1
        else:
            # Fall back to the local synonym database via Sphinx search.
            search_query_syn = Search(indexes=['onetipp_syn_simple'],
                                      config=SphinxitConfig)
            search_query_syn = search_query_syn.match(word).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=350,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            synID = 0
            try:
                # First attribute value of the first hit is the row id.
                synID = sphinx_result_syn['result']['items'][0].values()[0]
                if synID > 0:
                    # Parameterized lookup (original used % interpolation).
                    cursorMysql.execute(
                        "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s",
                        (synID,))
                    syn_content = cursorMysql.fetchone()
                    synContent = list(syn_content)
                    synContent = synContent[0].decode(encoding="utf-8",
                                                      errors="ignore")
                    if syn_content:
                        # Column holds a ";"-separated synonym list.
                        synwords = synContent.split(";")
                        synDict = {}
                        for wSyn in synwords:
                            if wSyn not in noDoubleHash:
                                synDict[wSyn] = SynRanker(wSyn, word)
                        sortedSynList = sorted(synDict.items(),
                                               key=lambda x: x[1], reverse=True)
                        firstBestSynHit = str(sortedSynList[0][0])
                        firstBestSynHitRank = str(sortedSynList[0][1])
                        # later: randomly choose among equally top-ranked
                        tokens[count] = '' + deumlaut(firstBestSynHit) + ''
                        noDoubleHash.add(firstBestSynHit)
                        tokensRaw[count] = deumlaut(firstBestSynHit)
                        changeEveryWordFlag = 1
                        changeEveryWordTemp += 1
            except IndexError:
                # No sphinx hit or empty candidate list after the
                # noDoubleHash filter: keep the original word untouched.
                # (Original had a stray bare `print` here.)
                pass
# Write the result file: the synonym-swapped text, then a demo Russian
# transliteration of the raw token stream.
outputtext = ' '.join(tokens)
outputtextRussia = ' '.join(tokensRaw)
# Explicit encoding so unicode text is written safely.  Fixes vs.
# original: the two newline writes were broken multi-line string
# literals (syntax errors), and f.close() inside the with-block was
# redundant — the context manager closes the handle.
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)
    f.write("\n")
    f.write("RUSSISCHE TRANSLITERATION:BEISPIEL VERSION")
    f.write("\n")
    f.write(translit(outputtextRussia, 'ru'))

mysql.commit()
mysql.close()
exit(0)
# re_sonder = r"[(\?|\.|\!)]$(\)"
#re_space = r"(\t|\r|\n|\s|\w)"